# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go
#data = pd.read_csv('C:/Users/migue/Desktop/data_preprocess.csv')
#data['TID']=data['Point-of-Sale_ID'].astype(str)+data['ProductFamily_ID'].astype(str)+data['Date'].astype(str)
#data['Year'] = data['Date'].astype(str).str[:4]
#data['Year']=data['Year'].astype(int)
#df = data[['ProductName_ID','Point-of-Sale_ID','Year','Quarter','TID']].copy()
#df.to_csv('C:/Users/migue/Desktop/mba_df.csv',index=False)
#"C:/Users/Pedro/Desktop/Business Cases/BC5/Datasets/mba_dash.csv"
df = pd.read_csv('C:/Users/migue/Desktop/Datasets/mba_df.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 91171152 entries, 0 to 91171151 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 ProductName_ID int64 1 Point-of-Sale_ID int64 2 Year int64 3 Quarter int64 4 TID object dtypes: int64(4), object(1) memory usage: 3.4+ GB
df.head()
| ProductName_ID | Point-of-Sale_ID | Year | Quarter | TID | |
|---|---|---|---|---|---|
| 0 | 649 | 1 | 2017 | 1 | 1162017-03-04 |
| 1 | 649 | 1 | 2016 | 2 | 1162016-05-02 |
| 2 | 649 | 1 | 2016 | 4 | 1162016-10-24 |
| 3 | 649 | 1 | 2017 | 4 | 1162017-10-13 |
| 4 | 649 | 1 | 2017 | 4 | 1162017-10-14 |
# Creating DF for Dashboard Visualization
#mba_dash = df[(df['Year']==2016)&(df['Quarter'].isin([1,2,3]))&(df['Point-of-Sale_ID'].isin([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]))]
#mba_dash.info()
#mba_dash.to_csv('C:/Users/migue/Desktop/mba_dash.csv',index=False)
orders = df[(df['Year']==2016)&(df['Quarter']==3)&(df['Point-of-Sale_ID']==3)]
orders.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 17806 entries, 370438 to 645177 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ProductName_ID 17806 non-null int64 1 Point-of-Sale_ID 17806 non-null int64 2 Year 17806 non-null int64 3 Quarter 17806 non-null int64 4 TID 17806 non-null object dtypes: int64(4), object(1) memory usage: 834.7+ KB
pt = pd.pivot_table(orders[['TID','ProductName_ID']], index='TID', columns='ProductName_ID',
aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
pt.head()
| ProductName_ID | 6 | 14 | 15 | 17 | 19 | 21 | 24 | 27 | 38 | 39 | ... | 2817 | 2834 | 2838 | 2839 | 2841 | 2845 | 2846 | 2847 | 2848 | 2850 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TID | |||||||||||||||||||||
| 3102016-07-01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3102016-07-02 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3102016-07-04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3102016-07-05 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3102016-07-06 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 737 columns
pt.info()
<class 'pandas.core.frame.DataFrame'> Index: 1432 entries, 3102016-07-01 to 392016-09-30 Columns: 737 entries, 6 to 2850 dtypes: float64(737) memory usage: 8.1+ MB
frequent_itemsets = apriori(pt, min_support=0.05, use_colnames=True)
frequent_itemsets
| support | itemsets | |
|---|---|---|
| 0 | 0.054469 | (21) |
| 1 | 0.055866 | (78) |
| 2 | 0.058659 | (198) |
| 3 | 0.059358 | (200) |
| 4 | 0.059358 | (216) |
| ... | ... | ... |
| 749 | 0.050978 | (1408, 198, 2379, 2412, 567, 2399) |
| 750 | 0.050279 | (1408, 2379, 2412, 912, 567, 216) |
| 751 | 0.050978 | (1408, 2379, 2412, 567, 216, 2399) |
| 752 | 0.050279 | (1408, 198, 2379, 2412, 912, 567, 216) |
| 753 | 0.050978 | (1408, 198, 2379, 2412, 567, 216, 2399) |
754 rows × 2 columns
frequent_itemsets.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 754 entries, 0 to 753 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 support 754 non-null float64 1 itemsets 754 non-null object dtypes: float64(1), object(1) memory usage: 11.9+ KB
# Generate the association rules - by lift
rulesLift = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
rulesLift.sort_values(by='confidence', ascending=False, inplace=True)
rulesLift
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
|---|---|---|---|---|---|---|---|---|---|
| 6303 | (1408, 912, 198, 2399) | (216) | 0.050279 | 0.059358 | 0.050279 | 1.0 | 16.847059 | 0.047295 | inf |
| 6496 | (216, 2378, 2412) | (1408, 198) | 0.050978 | 0.053771 | 0.050978 | 1.0 | 18.597403 | 0.048237 | inf |
| 2032 | (200, 2412, 198) | (216) | 0.051676 | 0.059358 | 0.051676 | 1.0 | 16.847059 | 0.048609 | inf |
| 6504 | (2378, 2412) | (1408, 198, 216) | 0.050978 | 0.053771 | 0.050978 | 1.0 | 18.597403 | 0.048237 | inf |
| 658 | (2378, 2412) | (198) | 0.050978 | 0.058659 | 0.050978 | 1.0 | 17.047619 | 0.047987 | inf |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2141 | (2379) | (200, 1408, 198) | 0.062849 | 0.050978 | 0.050279 | 0.8 | 15.693151 | 0.047075 | 4.745112 |
| 4074 | (2379) | (912, 2378, 216) | 0.062849 | 0.050978 | 0.050279 | 0.8 | 15.693151 | 0.047075 | 4.745112 |
| 6718 | (2379) | (219, 2412, 198, 567) | 0.062849 | 0.050978 | 0.050279 | 0.8 | 15.693151 | 0.047075 | 4.745112 |
| 1082 | (2379) | (912, 219) | 0.062849 | 0.050978 | 0.050279 | 0.8 | 15.693151 | 0.047075 | 4.745112 |
| 7557 | (2379) | (216, 219, 2412, 567) | 0.062849 | 0.050978 | 0.050279 | 0.8 | 15.693151 | 0.047075 | 4.745112 |
10422 rows × 9 columns
data_scatter = dict(type='scatter',
y=rulesLift['confidence'],
x=rulesLift['lift'],
#text=rulesLift.index,
#mode='markers',
#marker=dict(
#size=rulesLift['support'],
hovertemplate=#'Grand Prix: ' + df_racetracks["name_x"] + '<br>'
#'RuleID: ' + rulesLift.index + '<br>'+
'Lift: ' + rulesLift["lift"].astype(str) + '<br>'+
'Confidence: ' + rulesLift['confidence'].astype(str) + '<br>'+
'Support: ' + rulesLift['support'].astype(str) + '<br>'+
'Antecedents: ' + rulesLift['antecedents'].astype(str) + '<br>'+
'Consequents: ' + rulesLift['consequents'].astype(str) + '<br>'
'<extra></extra>',
#color=scatterdf['Avg Salary'], # set color equal to a variable
#color_continuous_scale='mint', # one of plotly colorscales
#showscale=False,
mode='markers',
marker=dict(size=8,
#size=rulesLift['support'],
color=rulesLift['support'],
colorscale='oranges',
showscale=True,
line_width=2),
)
layout = dict(
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
xaxis_title="Lift",
yaxis_title="Confidence"
)
fig = go.Figure(data=data_scatter,layout=layout)
fig.show()